{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "# importing required libraries\n",
    "import pandas as pd\n",
    "from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.metrics import f1_score\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.read_csv('dataset/twitter_sentiments.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>label</th>\n",
       "      <th>tweet</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>@user when a father is dysfunctional and is s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>@user @user thanks for #lyft credit i can't us...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>bihday your majesty</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>#model   i love u take with u all the time in ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>factsguide: society now    #motivation</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id  label                                              tweet\n",
       "0   1      0   @user when a father is dysfunctional and is s...\n",
       "1   2      0  @user @user thanks for #lyft credit i can't us...\n",
       "2   3      0                                bihday your majesty\n",
       "3   4      0  #model   i love u take with u all the time in ...\n",
       "4   5      0             factsguide: society now    #motivation"
      ]
     },
     "execution_count": 95,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(31962, 3)"
      ]
     },
     "execution_count": 96,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    29720\n",
       "1     2242\n",
       "Name: label, dtype: int64"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.label.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "train, test = train_test_split(data, test_size = 0.2, stratify = data['label'], random_state=21)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((25569, 3), (6393, 3))"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train.shape, test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    0.929837\n",
       "1    0.070163\n",
       "Name: label, dtype: float64"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train.label.value_counts(normalize=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    0.929923\n",
       "1    0.070077\n",
       "Name: label, dtype: float64"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test.label.value_counts(normalize=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "tfidf_vectorizer = TfidfVectorizer(lowercase= True, max_features=1000, stop_words=ENGLISH_STOP_WORDS)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
       "                dtype=<class 'numpy.float64'>, encoding='utf-8',\n",
       "                input='content', lowercase=True, max_df=1.0, max_features=1000,\n",
       "                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,\n",
       "                smooth_idf=True,\n",
       "                stop_words=frozenset({'a', 'about', 'above', 'across', 'after',\n",
       "                                      'afterwards', 'again', 'against', 'all',\n",
       "                                      'almost', 'alone', 'along', 'already',\n",
       "                                      'also', 'although', 'always', 'am',\n",
       "                                      'among', 'amongst', 'amoungst', 'amount',\n",
       "                                      'an', 'and', 'another', 'any', 'anyhow',\n",
       "                                      'anyone', 'anything', 'anyway',\n",
       "                                      'anywhere', ...}),\n",
       "                strip_accents=None, sublinear_tf=False,\n",
       "                token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b', tokenizer=None, use_idf=True,\n",
       "                vocabulary=None)"
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tfidf_vectorizer.fit(train.tweet)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_idf = tfidf_vectorizer.transform(train.tweet)\n",
    "test_idf  = tfidf_vectorizer.transform(test.tweet)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "model_LR = LogisticRegression()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
       "                   intercept_scaling=1, l1_ratio=None, max_iter=100,\n",
       "                   multi_class='auto', n_jobs=None, penalty='l2',\n",
       "                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,\n",
       "                   warm_start=False)"
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model_LR.fit(train_idf, train.label)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
    "predict_train = model_LR.predict(train_idf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "predict_test = model_LR.predict(test_idf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.4888178913738019"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# f1 score on train data\n",
    "f1_score(y_true= train.label, y_pred= predict_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.45751633986928114"
      ]
     },
     "execution_count": 69,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "f1_score(y_true= test.label, y_pred= predict_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [],
   "source": [
    "pipeline = Pipeline(steps= [('tfidf', TfidfVectorizer(lowercase=True,\n",
    "                                                      max_features=1000,\n",
    "                                                      stop_words= ENGLISH_STOP_WORDS)),\n",
    "                            ('model', LogisticRegression())])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Pipeline(memory=None,\n",
       "         steps=[('tfidf',\n",
       "                 TfidfVectorizer(analyzer='word', binary=False,\n",
       "                                 decode_error='strict',\n",
       "                                 dtype=<class 'numpy.float64'>,\n",
       "                                 encoding='utf-8', input='content',\n",
       "                                 lowercase=True, max_df=1.0, max_features=1000,\n",
       "                                 min_df=1, ngram_range=(1, 1), norm='l2',\n",
       "                                 preprocessor=None, smooth_idf=True,\n",
       "                                 stop_words=frozenset({'a', 'about', 'above',\n",
       "                                                       'across', 'after',\n",
       "                                                       'afterward...\n",
       "                                 strip_accents=None, sublinear_tf=False,\n",
       "                                 token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n",
       "                                 tokenizer=None, use_idf=True,\n",
       "                                 vocabulary=None)),\n",
       "                ('model',\n",
       "                 LogisticRegression(C=1.0, class_weight=None, dual=False,\n",
       "                                    fit_intercept=True, intercept_scaling=1,\n",
       "                                    l1_ratio=None, max_iter=100,\n",
       "                                    multi_class='auto', n_jobs=None,\n",
       "                                    penalty='l2', random_state=None,\n",
       "                                    solver='lbfgs', tol=0.0001, verbose=0,\n",
       "                                    warm_start=False))],\n",
       "         verbose=False)"
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pipeline.fit(train.tweet, train.label)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0, 0, 0, ..., 0, 0, 0])"
      ]
     },
     "execution_count": 77,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pipeline.predict(train.tweet)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [],
   "source": [
    "text = [\"Virat Kohli, AB de Villiers set to auction their 'Green Day' kits from 2016 IPL match to raise funds\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0])"
      ]
     },
     "execution_count": 94,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pipeline.predict(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [],
   "source": [
    "from joblib import dump"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['text_classification.joblib']"
      ]
     },
     "execution_count": 90,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dump(pipeline, filename=\"text_classification.joblib\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>label</th>\n",
       "      <th>tweet</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>14</td>\n",
       "      <td>1</td>\n",
       "      <td>@user #cnn calls #michigan middle school 'buil...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>15</td>\n",
       "      <td>1</td>\n",
       "      <td>no comment!  in #australia   #opkillingbay #se...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>18</td>\n",
       "      <td>1</td>\n",
       "      <td>retweet if you agree!</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>24</td>\n",
       "      <td>1</td>\n",
       "      <td>@user @user lumpy says i am a . prove it lumpy.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>35</td>\n",
       "      <td>1</td>\n",
       "      <td>it's unbelievable that in the 21st century we'...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31934</th>\n",
       "      <td>31935</td>\n",
       "      <td>1</td>\n",
       "      <td>lady banned from kentucky mall. @user  #jcpenn...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31946</th>\n",
       "      <td>31947</td>\n",
       "      <td>1</td>\n",
       "      <td>@user omfg i'm offended! i'm a  mailbox and i'...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31947</th>\n",
       "      <td>31948</td>\n",
       "      <td>1</td>\n",
       "      <td>@user @user you don't have the balls to hashta...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31948</th>\n",
       "      <td>31949</td>\n",
       "      <td>1</td>\n",
       "      <td>makes you ask yourself, who am i? then am i a...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31960</th>\n",
       "      <td>31961</td>\n",
       "      <td>1</td>\n",
       "      <td>@user #sikh #temple vandalised in in #calgary,...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2242 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          id  label                                              tweet\n",
       "13        14      1  @user #cnn calls #michigan middle school 'buil...\n",
       "14        15      1  no comment!  in #australia   #opkillingbay #se...\n",
       "17        18      1                             retweet if you agree! \n",
       "23        24      1    @user @user lumpy says i am a . prove it lumpy.\n",
       "34        35      1  it's unbelievable that in the 21st century we'...\n",
       "...      ...    ...                                                ...\n",
       "31934  31935      1  lady banned from kentucky mall. @user  #jcpenn...\n",
       "31946  31947      1  @user omfg i'm offended! i'm a  mailbox and i'...\n",
       "31947  31948      1  @user @user you don't have the balls to hashta...\n",
       "31948  31949      1   makes you ask yourself, who am i? then am i a...\n",
       "31960  31961      1  @user #sikh #temple vandalised in in #calgary,...\n",
       "\n",
       "[2242 rows x 3 columns]"
      ]
     },
     "execution_count": 92,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data[data.label == 1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
